This document shows how I used the newly-written
vultureUtils package, along with real vulture movement data
from the year 2021 (co-feeding network only!) to explore and
parameterize my toy rewiring model.
Now I’m going to use these data to create networks and do some calculations, in order to parameterize the model.
# Build unweighted co-feeding networks over 5-day windows across the whole
# study period, keeping every tracked individual as a vertex (even isolates).
graphResult <- vultureUtils::makeGraphs(
  edges = southernEdges_20200101_20220430,
  interval = "5 days",
  dateTimeStart = "2020-01-01 00:00:00",
  dateTimeEnd = "2022-04-30 11:59:00",
  weighted = FALSE,
  allVertices = TRUE
)
fivedays <- graphResult$graphs
# Using the 5-day interval graph
# Drop isolated vertices (degree 0) from each 5-day snapshot. Pass explicit
# vertex indices via which() instead of a raw logical vector (whose coercion
# by delete.vertices is version-dependent), and namespace the igraph calls
# for consistency with the rest of this script.
fivedays_reduced <- lapply(fivedays, function(x){
  igraph::delete.vertices(x, which(igraph::degree(x) < 1))
})
dates <- names(fivedays_reduced)
# One row per (individual, window): the vertices present in each reduced
# graph, tagged with that window's start date.
verts <- map2(.x = fivedays_reduced, .y = dates, .f = function(g, d){
  data.frame(trackId = names(igraph::V(g))) %>%
    mutate(earlyDate = lubridate::ymd(d))
})
allVerts <- as.data.frame(data.table::rbindlist(verts))
# Are there any individuals that disappeared before the end of the study or appeared late?
# First and last fix timestamp for every individual that appears in a network.
minmax <- southernPoints_20200101_20220430 %>%
  dplyr::filter(trackId %in% allVerts$trackId) %>%
  sf::st_drop_geometry() %>%
  dplyr::select(trackId, timestamp) %>%
  group_by(trackId) %>%
  summarize(minTime = min(timestamp), maxTime = max(timestamp)) %>%
  ungroup()
# Do we have individuals that stopped being tracked partway through the year, or that didn't start being tracked until later?
ggplot(minmax, aes(x = minTime)) +
  geom_histogram() # huh, looks like a lot of individuals didn't start being tracked until October. I guess a lot of tagging happened then. I wonder if I need to adjust the time window.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(minmax, aes(x = maxTime)) +
  geom_histogram() # a few stopped early but not many.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Let's view a full time graph of each individual.
# First peek at the raw sf point data to see the available columns.
utils::head(southernPoints_20200101_20220430)
## Simple feature collection with 6 features and 36 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: 34.81111 ymin: 30.85918 xmax: 34.81217 ymax: 30.86111
## Geodetic CRS: WGS 84
## tag_id external_temperature gps_hdop gps_satellite_count gps_time_to_fix
## 1 1264172250 29 1.6 5 11.52
## 2 1264172250 33 1.7 5 11.52
## 3 1264172250 35 1.9 4 11.53
## 4 1264172250 37 1.9 4 11.52
## 5 1264172250 38 1.3 6 34.36
## 6 1264172250 39 1.4 6 11.54
## ground_speed heading height_above_msl import_marked_outlier light_level
## 1 0.0000000 102 495 false 829
## 2 0.0000000 248 500 false 2043
## 3 0.0000000 283 505 false 1608
## 4 0.2777778 183 506 false 1797
## 5 0.0000000 337 523 false 1127
## 6 0.0000000 249 502 false 972
## location_lat location_long timestamp update_ts
## 1 30.86099 34.81205 2020-09-08 05:08:56 2020-09-28 15:29:53.687
## 2 30.86106 34.81215 2020-09-08 05:18:56 2020-09-28 15:29:53.687
## 3 30.86111 34.81213 2020-09-08 05:28:56 2020-09-28 15:29:53.687
## 4 30.86109 34.81217 2020-09-08 05:38:56 2020-09-28 15:29:53.687
## 5 30.85918 34.81111 2020-09-08 05:49:19 2020-09-28 15:29:53.687
## 6 30.85920 34.81114 2020-09-08 05:58:56 2020-09-28 15:29:53.687
## visible deployment_id event_id tag_local_identifier location_long.1
## 1 true 1265763554 16422113109 202381 34.81205
## 2 true 1265763554 16422113110 202381 34.81215
## 3 true 1265763554 16422113111 202381 34.81213
## 4 true 1265763554 16422113112 202381 34.81217
## 5 true 1265763554 16422113113 202381 34.81111
## 6 true 1265763554 16422113114 202381 34.81114
## location_lat.1 trackId comments death_comments
## 1 30.86099 A00w Sde Boker wild caught
## 2 30.86106 A00w Sde Boker wild caught
## 3 30.86111 A00w Sde Boker wild caught
## 4 30.86109 A00w Sde Boker wild caught
## 5 30.85918 A00w Sde Boker wild caught
## 6 30.85920 A00w Sde Boker wild caught
## exact_date_of_birth individual_id latest_date_born local_identifier
## 1 2013-01-01 00:00:00.000 1265763523 A00w
## 2 2013-01-01 00:00:00.000 1265763523 A00w
## 3 2013-01-01 00:00:00.000 1265763523 A00w
## 4 2013-01-01 00:00:00.000 1265763523 A00w
## 5 2013-01-01 00:00:00.000 1265763523 A00w
## 6 2013-01-01 00:00:00.000 1265763523 A00w
## ring_id sex timestamp_start timestamp_end
## 1 E88 Yellow 2020-09-08 03:49:35.000 2022-07-07 16:15:07.000
## 2 E88 Yellow 2020-09-08 03:49:35.000 2022-07-07 16:15:07.000
## 3 E88 Yellow 2020-09-08 03:49:35.000 2022-07-07 16:15:07.000
## 4 E88 Yellow 2020-09-08 03:49:35.000 2022-07-07 16:15:07.000
## 5 E88 Yellow 2020-09-08 03:49:35.000 2022-07-07 16:15:07.000
## 6 E88 Yellow 2020-09-08 03:49:35.000 2022-07-07 16:15:07.000
## number_of_events number_of_deployments sensor_type_ids taxon_detail
## 1 33700 1 GPS NA
## 2 33700 1 GPS NA
## 3 33700 1 GPS NA
## 4 33700 1 GPS NA
## 5 33700 1 GPS NA
## 6 33700 1 GPS NA
## dateOnly geometry
## 1 2020-09-08 POINT (34.81205 30.86099)
## 2 2020-09-08 POINT (34.81215 30.86106)
## 3 2020-09-08 POINT (34.81213 30.86111)
## 4 2020-09-08 POINT (34.81217 30.86109)
## 5 2020-09-08 POINT (34.81111 30.85918)
## 6 2020-09-08 POINT (34.81114 30.8592)
# One row per individual per calendar day with at least one GPS fix.
timeseries <- southernPoints_20200101_20220430 %>%
  sf::st_drop_geometry() %>%
  dplyr::distinct(trackId, dateOnly)
# get order, and plot
# Order individuals by number of tracked days, fewest first.
# NOTE: this name shadows base::order() for the rest of the script.
order <- timeseries %>%
  count(trackId, name = "n") %>%
  arrange(n) %>%
  pull(trackId)
timeseries %>%
  mutate(trackId = factor(trackId, levels = order)) %>%
  ggplot(aes(x = dateOnly, y = trackId)) +
  geom_point(size = 1)
# See what it looks like if we restrict the dates to 2020-01-01 through 2021-09-01
startDate <- "2020-01-01"
endDate <- "2021-09-01"
# NOTE(review): both comparisons are strict, so the start and end dates
# themselves are excluded — confirm that's intended.
timeseries %>%
  filter(dateOnly > lubridate::ymd(startDate)) %>%
  filter(dateOnly < lubridate::ymd(endDate)) %>%
  mutate(trackId = factor(trackId, levels = order)) %>%
  ggplot(aes(x = dateOnly, y = trackId)) +
  geom_point(size = 1)
# Now remove a few individuals not observed both before and after February 2021
# Keep only individuals whose tracking span strictly brackets the cutoff.
cutoff <- lubridate::ymd("2021-02-01")
toKeep <- timeseries %>%
  group_by(trackId) %>%
  summarize(first = min(dateOnly), last = max(dateOnly)) %>%
  filter(first < cutoff, last > cutoff) %>%
  pull(trackId)
# Get usable edges, removing the individuals that don't fall within those constraints, and removing bad dates.
# NOTE(review): `|` keeps an edge when EITHER endpoint is in toKeep, so a
# dropped individual can still appear via its edges to kept ones — confirm
# whether `&` (both endpoints kept) was intended.
toUse <- southernEdges_20200101_20220430 %>%
  filter(ID1 %in% toKeep | ID2 %in% toKeep) %>%
  filter(minTimestamp > lubridate::ymd(startDate)) %>%
  filter(maxTimestamp < lubridate::ymd(endDate))
Going to do a sensitivity analysis over various increments, ranging from 1 to 31 days.
# First, create a range of plausible day increments. We don't want to integrate the networks over more than a month or less than a day, so let's do 1-31 days, incrementing by 4.
# seq(1, 31, by = 4) yields 1, 5, 9, ..., 29 (31 itself is never reached).
interval.num <- seq(from = 1, to = 31, by = 4)
interval <- paste(interval.num, "days", sep = " ")
# Now I want to get the probability distributions for edge gain/loss, given two steps of history.
# One list element per candidate interval: the edge gain/loss probabilities
# computed from networks integrated over that window. Preallocated, and
# indexed with seq_along() (safe even if `interval` were empty, unlike
# 1:length()).
histdfs <- vector(mode = "list", length = length(interval))
for (i in seq_along(interval)) {
  graphs <- vultureUtils::makeGraphs(edges = toUse, interval = interval[i],
                                     dateTimeStart = "2020-01-01 00:00:00",
                                     dateTimeEnd = "2021-09-01 11:59:00",
                                     weighted = FALSE, allVertices = TRUE)$graphs
  histdfs[[i]] <- vultureUtils::computeProbs(graphs)
}
# Make the list into a single data frame. First, add the time interval to each list element:
histdfs <- map2(.x = histdfs, .y = interval, .f = function(.x, .y){
.x$interval = factor(.y, levels = .y)
return(.x)
})
# Then, bind the list into a data frame for plotting.
sensData <- data.table::rbindlist(histdfs)
sensData <- mutate(sensData, earlyDate = lubridate::ymd(earlyDate))
Now we can use the data from this sensitivity analysis to graph two
things: the probability distribution for add00,
add10, lose01, and lose11; and
the change in each of these metrics over time (to see if we need to add
time dependence to the model.)
First, let’s look at the probability distributions.
sensData %>%
ggplot(aes(x = prob, col = interval))+
geom_density(size = 1)+
facet_wrap(~type)+
theme_minimal()+
scale_color_viridis_d()
## Warning: Removed 230 rows containing non-finite values (stat_density).
Okay, cool. We don’t see a ton of difference between the different time windows, but in general the 1- and 5-day intervals seem to look a bit more distinct. Leaning toward going with those anyway, but let’s also take a look at these probabilities over time.
sensData %>%
ggplot(aes(x = earlyDate, y = prob, col = interval))+
geom_smooth(se = FALSE)+
facet_wrap(~type)+
theme_minimal()+
scale_color_viridis_d()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 230 rows containing non-finite values (stat_smooth).
This is a bit more chaotic. Seems like there might be some seasonality going on here. But I don’t see any clear patterns emerging, and the darker lines (shorter time windows) seem to be a bit more steady and constant than the longer time windows, which is another argument for picking a shorter time window.
Importantly, I don’t see any super strong trends over time here. This is nice because it means I can carry on modeling these probabilities as distributions, without adding time dependence.
Now, let’s take a look at how the network density changes over time. This will be another way to decide which time interval to use for the model.
# Now let's take a look at the network densities and see how they change.
# Build graphs for every candidate interval over the same study window used
# everywhere else (2020-01-01 through 2021-09-01). The end date previously
# read "2022-09-01", inconsistent with `endDate` and the other makeGraphs
# calls; since `toUse` is already filtered to before 2021-09-01, the extra
# year would only add empty windows that drag the mean density down.
intervalGraphs <- lapply(interval, function(x){
  vultureUtils::makeGraphs(edges = toUse, interval = x,
                           dateTimeStart = "2020-01-01 00:00:00",
                           dateTimeEnd = "2021-09-01 11:59:00",
                           weighted = FALSE, allVertices = TRUE)$graphs
})
# compile the density information
# Per-graph edge density for every interval, bound into one long table.
# Factor levels use the full `interval` vector so the levels are identical
# and correctly ordered across all list elements (previously each element
# got its own single-level factor from `.y` alone).
densities <- map2(.x = intervalGraphs, .y = interval, .f = function(.x, .y){
  lapply(.x, igraph::edge_density) %>%
    unlist() %>%
    as.data.frame() %>%
    setNames(., "density") %>%
    mutate(earlyDate = lubridate::ymd(row.names(.)),
           interval = factor(.y, levels = interval))
}) %>%
  data.table::rbindlist()
Time to visualize the density information.
# plot the density information
ggplot(densities, aes(x = earlyDate, y = density, col = interval)) +
  geom_smooth(se = FALSE) +
  theme_minimal() +
  scale_color_viridis_d()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(densities, aes(x = earlyDate, y = density, col = interval)) +
  geom_point(alpha = 0.5) +
  geom_smooth(se = FALSE) +
  facet_wrap(~interval) +
  theme_minimal() +
  scale_color_viridis_d()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
In general, it looks like the pattern gets a lot noisier as the time window gets bigger, which is kind of interesting and counter-intuitive. I want the graph density to remain roughly constant, so I’m going to look at choosing one of the shorter time windows. For simplicity, I’m more inclined to go with 5 days rather than 1 day.
Let’s look at the distribution of densities for 1 day and 5 days.
# Compare the density distributions for the two shortest windows.
ggplot(filter(densities, interval %in% c("1 days", "5 days")),
       aes(x = density)) +
  geom_density() +
  facet_wrap(~interval) +
  theme_minimal()
When we have a 1-day time window, the reason the density is so consistent is that it’s so close to zero, because almost no edges are present at any given time. It’s just small groups of individuals feeding together each day.
5 days is looking like a more reasonable distribution. Note that this assumes we’re allowing isolated nodes and that most individuals aren’t connected on any given day. Hopefully adding the density parameters to the model will help with that.
Calculate some parameter values to use in the model:
# Get the mean network density for 5 days
int <- "5 days"
fiveDayDensities <- densities %>%
  filter(interval == int) %>%
  pull(density)
density_5day <- mean(fiveDayDensities)
# Get beta distributions to fit each of the probabilities.
probs <- sensData %>%
dplyr::filter(!is.nan(prob),
!is.na(prob)) %>%
dplyr::mutate(prob = (prob - min(prob) + 0.001) / (max(prob) - min(prob) + 0.002)) %>%
dplyr::filter(interval == int)
# Fit a beta distribution to the rescaled probabilities for one transition
# type (e.g. "add00"). Factors out the pipeline that was repeated four times.
fitBetaFor <- function(df, probType) {
  fitdist(df %>% filter(type == probType) %>% pull(prob), "beta")
}
fit_add00 <- fitBetaFor(probs, "add00")
fit_add10 <- fitBetaFor(probs, "add10")
fit_lose01 <- fitBetaFor(probs, "lose01")
fit_lose11 <- fitBetaFor(probs, "lose11")
# Visualize each of the distributions to examine fit.
for (fit in list(fit_add00, fit_add10, fit_lose01, fit_lose11)) {
  plot(fit, las = 1)
}
I guess I’ll use these beta distributions… Let’s save them as a list.
# Bundle the fitted shape parameters plus a human-readable provenance note,
# then persist for use by the rewiring model.
betaDistributions <- list(
  "add00" = fit_add00$estimate,
  "add10" = fit_add10$estimate,
  "lose01" = fit_lose01$estimate,
  "lose11" = fit_lose11$estimate,
  "metadata" = paste("these are the beta distributions for `add` and `lose` model parameters, using an interval of", int, "over the date range", startDate, "to", endDate)
)
save(betaDistributions, file = "data/betaDistributions.Rda")
The mean network density is 0.1185458.
We’re working with only the 5-day data here.
# Look up the 5-day graphs by name rather than hard-coding index [[2]],
# so this doesn't silently break if the `interval` sequence changes.
graphs <- intervalGraphs[[which(interval == "5 days")]]
Compute individuals’ degree over time.
degrees <- lapply(graphs, igraph::degree)
# make a data frame
# Long table: one row per (individual, window) with that window's degree.
degreeData <- map2(.x = degrees, .y = names(degrees), .f = function(.x, .y){
  as.data.frame(.x) %>%
    setNames(., "degree") %>%
    mutate(trackId = row.names(.),
           earlyDate = lubridate::ymd(.y))
}) %>%
  data.table::rbindlist()
Plot the results:
ggplot(degreeData, aes(x = earlyDate, y = degree, col = trackId)) +
  geom_smooth(se = FALSE) +
  theme_minimal() +
  theme(legend.position = "none")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
At first glance, individuals’ degrees seem to be relatively stable over time.
What about the degree distribution of the population over time?
ggplot(degreeData, aes(x = degree, col = as.factor(earlyDate))) +
  geom_density() +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_color_viridis_d()
This isn’t very informative. What happens if we remove individuals with degree 0, i.e. individuals not participating in feeding interactions during this time slice?
# Same degree-distribution plot, but excluding isolated individuals.
ggplot(filter(degreeData, degree != 0),
       aes(x = degree, col = as.factor(earlyDate))) +
  geom_density() +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_color_viridis_d()
Huh, even after removing the individuals with degree 0, we still have an incredibly right-skewed degree distribution.